%%writefile lab/test.cpp

#include <chrono>
#include <iostream>
#include <CL/sycl.hpp>

#define random_float() (rand() / double(RAND_MAX))

using namespace std;
using namespace sycl;

#define tileY 8 
#define tileX 4

// return execution time
// Tiled GEMM on the device: each work-item computes a tileY x tileX block of C
// using register-resident sub-tiles of A and B (2x2 register blocking).
//   A: M x K, B: K x N, C: M x N, all row-major.
// BLOCK is the square work-group edge; q must have profiling enabled.
// Returns the kernel execution time in milliseconds (from event profiling).
double gpu_kernel(float* A, float* B, float* C,
    int M, int N, int K,
    int BLOCK, sycl::queue& q) {

    // One work-item per C tile: the global range is the grid of tiles.
    auto grid_rows = M / tileY;
    auto grid_cols = N / tileX;
    auto local_ndrange = range<2>(BLOCK, BLOCK);
    auto global_ndrange = range<2>(grid_rows, grid_cols);

    double duration = 0.0;

    auto e = q.submit([&](sycl::handler& h) {
        h.parallel_for<class k_name_t>(
            sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) {

                // Top-left corner of this work-item's C tile.
                int row = tileY * index.get_global_id(0);
                int col = tileX * index.get_global_id(1);

                float sum[tileY][tileX] = { 0.0f };
                float subA[tileY][2] = { 0.0f };
                float subB[tileX][2] = { 0.0f };

                // Reduce over the shared dimension K (A is M x K, so its
                // leading dimension is K, not N — the original only worked
                // because the lab uses M == N == K).
                for (int k = 0; k < K; k++) {

                    // Stage a column slice of A and a row slice of B in registers.
                    for (int m = 0; m < tileY; m += 2) {
                        subA[m][0] = A[(row + m) * K + k];
                        subA[m][1] = A[(row + m + 1) * K + k];
                    }

                    for (int p = 0; p < tileX; p += 2) {
                        subB[p][0] = B[k * N + p + col];
                        subB[p][1] = B[k * N + p + col + 1];
                    }

                    // 2x2 register blocking requires accumulating all FOUR
                    // row/column products. The original updated only the two
                    // diagonal entries, leaving sum[m][p+1] and sum[m+1][p]
                    // stuck at zero and producing wrong results in C.
                    for (int m = 0; m < tileY; m += 2) {
                        for (int p = 0; p < tileX; p += 2) {
                            sum[m][p]         += subA[m][0] * subB[p][0];
                            sum[m][p + 1]     += subA[m][0] * subB[p][1];
                            sum[m + 1][p]     += subA[m][1] * subB[p][0];
                            sum[m + 1][p + 1] += subA[m][1] * subB[p][1];
                        }
                    }

                } //end of K

                // Write the finished tile back to global memory.
                for (int m = 0; m < tileY; m++) {
                    for (int p = 0; p < tileX; p++) {
                        C[(row + m) * N + col + p] = sum[m][p];
                    }
                }

            });
        });
    e.wait();

    // Profiling counters are in nanoseconds; convert to milliseconds.
    duration += (e.get_profiling_info<info::event_profiling::command_end>() -
        e.get_profiling_info<info::event_profiling::command_start>()) / 1000.0f / 1000.0f;

    return(duration);
}

// return execution time
// Reference single-threaded GEMM on the host: cC[M x N] = cA[M x K] * cB[K x N],
// all matrices row-major. Returns the elapsed wall-clock time in milliseconds.
double cpu_kernel(float* cA, float* cB, float* cC, int M, int N, int K) {

    const auto start = std::chrono::high_resolution_clock::now();

    // Classic triple loop: one dot product per output element.
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float acc = 0.0f;
            for (int kk = 0; kk < K; ++kk) {
                acc += cA[row * K + kk] * cB[kk * N + col];
            }
            cC[row * N + col] = acc;
        }
    }

    const auto stop = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<float, std::milli>(stop - start).count();
}

// Count the elements where the CPU and GPU results differ by more than 1e-3.
// Returns 0 when the two arrays agree within tolerance.
int verify(float* cpu_res, float* gpu_res, int length) {
    int mismatches = 0;
    for (int idx = 0; idx < length; ++idx) {
        const float diff = fabs(cpu_res[idx] - gpu_res[idx]);
        if (diff > 1e-3) {
            ++mismatches;
            //printf("\n%lf, %lf", cpu_res[idx], gpu_res[idx]);
        }
    }
    return mismatches;
}

// Allocates USM buffers, benchmarks the GPU and CPU GEMMs, and cross-checks
// their results.
//   M, N, K: matrix sizes (C[M x N] = A[M x K] * B[K x N]).
//   block_size: square work-group edge passed to gpu_kernel.
//   iterations: number of timed GPU runs; the CPU does iterations / 2.
//   q: SYCL queue with profiling enabled.
// Returns the number of mismatching elements (0 on success).
int gemm(const int M,
    const int N,
    const int K,
    const int block_size,
    const int iterations,
    sycl::queue& q) {

    cout << "Problem size: c(" << M << "," << N << ") ="
        << " a(" << M << "," << K << ") *"
        << " b(" << K << "," << N << ")\n";

    auto A = malloc_shared<float>(M * K, q);
    auto B = malloc_shared<float>(K * N, q);
    auto C = malloc_shared<float>(M * N, q);
    auto C_host = malloc_host<float>(M * N, q);

    // Initialize A/B with random values, C/C_host with zeros.
    for (int i = 0; i < M * K; i++) {
        A[i] = random_float();
    }

    for (int i = 0; i < K * N; i++) {
        B[i] = random_float();
    }

    for (int i = 0; i < M * N; i++) {
        C[i] = 0.0f;
        C_host[i] = 0.0f;
    }

    // One multiply + one add per inner-product term.
    double flopsPerMatrixMul
        = 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);

    double duration_gpu = 0.0;
    double duration_cpu = 0.0;

    // GPU computation and timer: discard warm-up runs, average the rest.
    // (Keep the per-run time in a double; a float local would truncate it.)
    int warmup = 10;
    for (int run = 0; run < iterations + warmup; run++) {
        double duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
        if (run >= warmup) duration_gpu += duration;
    }
    duration_gpu = duration_gpu / iterations;

    // CPU computation and timer: half as many timed runs as the GPU
    // (guard against iterations < 2 producing a zero divisor).
    warmup = 2;
    const int cpu_runs = (iterations / 2 > 0) ? (iterations / 2) : 1;
    for (int run = 0; run < cpu_runs + warmup; run++) {
        double duration = cpu_kernel(A, B, C_host, M, N, K);
        if (run >= warmup) duration_cpu += duration;
    }
    // Average over the runs actually measured. The original divided by
    // iterations and then by 2 (i.e. by 2*iterations) while only
    // iterations/2 runs were timed, under-reporting CPU time by 4x.
    duration_cpu = duration_cpu / cpu_runs;

    // Compare the results of CPU and GPU.
    int errCode = 0;
    errCode = verify(C_host, C, M * N);
    if (errCode > 0) printf("\nThere are %d errors\n", errCode);

    printf("\nGEMM size M = %d, N = %d, K = %d", M, N, K);
    printf("\nWork-Group size = %d * %d, tile_X = %d, tile_Y = %d", block_size, block_size, tileX, tileY);
    printf("\nPerformance Flops = %lf, \n"
        "GPU Computation Time = %lf (ms); \n"
        "CPU Computaiton Time = %lf (ms); \n",
        flopsPerMatrixMul, duration_gpu, duration_cpu);

    free(A, q);
    free(B, q);
    free(C, q);
    free(C_host, q);

    return(errCode);
}

int main() {

    // Profiling must be enabled on the queue so gpu_kernel can read the
    // kernel's command_start / command_end timestamps.
    auto profiling = cl::sycl::property_list{ cl::sycl::property::queue::enable_profiling() };
    queue my_gpu_queue(cl::sycl::gpu_selector{}, profiling);

    // 512^3 GEMM, 4x4 work-groups, averaged over 10 timed GPU runs.
    const int errCode = gemm(512, 512, 512, /* GEMM size, M, N, K */
        4,             /* workgroup size */
        10,            /* repeat time */
        my_gpu_queue);

    return errCode;
}
